# Constructing the dataset for analysis
# Economic data from the World Bank, plus the R&P replication data from:
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/26391
# Input: raw data (Stata replication file and World Bank CSV extracts)
# Output: protests_econ_data.rdata
# Remove every (non-hidden) object from the current environment so the script
# does not pick up leftovers from an earlier session.
rm(list = ls())
# Keep strings as character vectors when data frames are built
# (this is already the default since R 4.0).
options(stringsAsFactors = FALSE)
# Fix the random seed for reproducibility.
# NOTE(review): no random draws are visible in this script - confirm whether
# the seed is still needed here.
seed_to_use <- 614
set.seed(seed_to_use)
library(data.table)
library(parallel)
library(haven) # used for reading in stata code
library(bit64)
# Kurtz/Arce replication data: read the Stata file and convert the result to
# a data.table in one step
d <- as.data.table(read_stata("kurtz_arce_replication.dta"))

# World Bank extracts (economic characteristics). Every file is read as a
# comma-separated table whose first row holds the column names.
bop <- fread("balance_payments_usd.csv", sep = ",", header = TRUE)
cab_usd <- fread("current_account_usd.csv", sep = ",", header = TRUE)
cab_pct_gdp <- fread("current_account_pct_gdp.csv", sep = ",", header = TRUE)
pop <- fread("population.csv", sep = ",", header = TRUE)
gdp <- fread("gdp_2010_usd.csv", sep = ",", header = TRUE)
debt_pct_gdp <- fread("debt_pct_gdp.csv", sep = ",", header = TRUE)
trade_balance <- fread("trade_balance.csv", sep = ",", header = TRUE)

# Extract the case identifier together with the per-country columns, then
# build a lookup table that links case id, abbreviation and country name.
countries <- d[, c("case", "bol", "bra", "chi", "col", "cri", "dom", "ecu",
  "sal", "gua", "hon", "jam", "mex", "par", "per", "uru", "ven"),
  with = FALSE]

# Drop rows that are NaN (Bolivia's column is used as the check) and keep one
# row per distinct combination.
countries <- unique(countries[!is.nan(bol)])

# Country key: V1 = case id, V2 = abbreviation. The first case has no column
# of its own in `countries`, so its abbreviation ("arg") is supplied by hand.
# cbind() builds a character matrix, so both columns end up as character.
name_code <- data.table(cbind(
  unique(countries$case),
  c("arg", names(countries)[-1])
))
name_code[, country := c("Argentina", "Bolivia", "Brazil", "Chile", "Colombia",
  "Costa Rica", "Dominican Republic", "Ecuador", "El Salvador", "Guatemala",
  "Honduras", "Jamaica", "Mexico", "Paraguay", "Peru", "Uruguay", "Venezuela")]

# Reshape a wide World Bank extract into a long table that is easier to
# analyse. Written as a function because the same steps are applied to every
# World Bank file.
#
# Args:
#   wb_data:       data.table holding a `Country Name` column plus one column
#                  per year; the year columns must be named so that
#                  as.numeric() can parse the name.
#   country_match: character vector of country names to keep
#                  (defaults to name_code$country).
#   var_name:      single string used as the name of the value column.
#   year_cols:     columns to melt, as positions or names. The default, 5:61,
#                  is the range this function has always used
#                  (presumably the yearly columns of the raw extract - confirm
#                  against the CSV layout before changing it).
#
# Returns:
#   A 3-column data.table: country, year (numeric), <var_name> (numeric).
#   Values that cannot be parsed as numbers (e.g. "") become NA.
wb_data_transform <- function(wb_data = NA,
  country_match = name_code$country, var_name = NA, year_cols = 5:61) {
  # Fail early with a clear message instead of an obscure error further down.
  stopifnot(
    is.data.table(wb_data),
    "Country Name" %in% names(wb_data),
    is.character(var_name), length(var_name) == 1L, !is.na(var_name)
  )

  # The World Bank labels Venezuela as "Venezuela, RB". Harmonise the name on
  # a plain vector so the caller's table is never touched.
  country_names <- gsub(pattern = "Venezuela, RB", replacement = "Venezuela",
    x = wb_data[["Country Name"]], fixed = TRUE)
  keep <- country_names %in% country_match

  # Row subsetting yields a new table, so updating it by reference is safe.
  wb_subset <- wb_data[keep]
  wb_subset[, `Country Name` := country_names[keep]]

  wb_melt <- melt(wb_subset, id.vars = "Country Name",
    measure.vars = year_cols)
  wb_melt[, value := as.numeric(value)]
  setnames(wb_melt, c("country", "year", var_name))
  # melt() stores the former column names as a factor; go through character
  # so the year itself is recovered rather than the factor level index.
  wb_melt[, year := as.numeric(as.character(year))]
  wb_melt
}
# One long-format table per World Bank indicator; var_name becomes the name
# of the value column in each result.
bop_melt <- wb_data_transform(bop, var_name = "balance_of_payment")
cab_usd_melt <- wb_data_transform(cab_usd, var_name = "cab_usd")
cab_pct_melt <- wb_data_transform(cab_pct_gdp, var_name = "cab_pct_gdp")
gdp_melt <- wb_data_transform(gdp, var_name = "gdp_2010")
pop_melt <- wb_data_transform(pop, var_name = "pop_count")
debt_melt <- wb_data_transform(debt_pct_gdp, var_name = "debt_pct_gdp")
trade_melt <- wb_data_transform(trade_balance, var_name = "ext_trade_bal")
# Key every World Bank table on (country, year) so they can be merged.
# setkeyv() modifies its argument by reference, so looping over a list of the
# tables updates the original objects.
for (wb_tbl in list(bop_melt, cab_usd_melt, cab_pct_melt, gdp_melt, pop_melt,
  debt_melt, trade_melt)) {
  setkeyv(wb_tbl, c("country", "year"))
}
rm(wb_tbl)

# Merging all of the data with data.table's X[Y] join syntax. The tables are
# joined one at a time, starting from the innermost pair, so each step adds
# one indicator column in front of those already collected.
new_dt <- pop_melt[debt_melt]
new_dt <- gdp_melt[new_dt]
new_dt <- cab_pct_melt[new_dt]
new_dt <- cab_usd_melt[new_dt]
new_dt <- bop_melt[new_dt]
new_dt <- trade_melt[new_dt]

# Variables we want from the solt/arce/kurtz data, limited to years before
# 2004 (row filter and column selection done in a single call).
solt_data <- d[year < 2004, .(year, case, e3, strikes, riots, antidemo, demo)]
solt_data[, year := as.numeric(year)]

# Fixing a STATA artifact: entries of e3 equal to "NaN" become NA.
solt_data[, e3 := replace(e3, e3 == "NaN", NA)]

# Changing variable modes. `democ` is a numeric copy of `demo`; `case` becomes
# character.
solt_data[, `:=`(
  democ = as.numeric(demo),
  e3 = as.numeric(e3),
  strikes = as.integer(strikes),
  riots = as.integer(riots),
  antidemo = as.integer(antidemo),
  case = as.character(case)
)]

# Total protest count, computed after the integer conversion above.
solt_data[, total_protests := strikes + riots + antidemo]

# Keep only the World Bank years that also occur in the protest data.
new_dt <- new_dt[year %in% solt_data$year]

# Dealing with empty strings: missing World Bank values can arrive as ""
# rather than NA. Every column is addressed by its exact name; blanks are
# only possible (and only replaced) where a column is still character.
wb_value_cols <- c("balance_of_payment", "cab_usd", "cab_pct_gdp", "gdp_2010",
  "pop_count", "debt_pct_gdp", "ext_trade_bal")
stopifnot(all(wb_value_cols %in% names(new_dt)))
for (wb_col in wb_value_cols) {
  wb_vals <- new_dt[[wb_col]]
  if (is.character(wb_vals)) {
    set(new_dt, i = which(!nzchar(wb_vals)), j = wb_col,
      value = NA_character_)
  }
}
rm(wb_value_cols, wb_col, wb_vals)

# NOTE(review): this step used to assign to `new_dt$gdp` and `new_dt$pop`.
# No columns with those names exist (the real ones are gdp_2010 and
# pop_count); the reads presumably resolved through `$` partial matching, and
# the writes left duplicate columns `gdp` and `pop` in the table. The
# duplicates are created explicitly here so the saved dataset keeps the same
# columns - drop them once it is confirmed nothing downstream reads them.
new_dt[, `:=`(gdp = gdp_2010, pop = pop_count)]

# Making the final dataset.
# 1) Attach the country key; its V1/V2 columns become `case` and `abbrev`.
new_dt2 <- new_dt[name_code, on = "country"]
setnames(new_dt2, c("V1", "V2"), c("case", "abbrev"))

# 2) Join the economic data onto the protest data by case and year; the
#    result has one row per row of solt_data.
protests_econ_data <- new_dt2[solt_data, on = c("case", "year")]

# 3) Derived measures: GDP per capita, and balance of payments divided by GDP.
protests_econ_data[, gdp_pc := as.numeric(gdp_2010) / as.numeric(pop_count)]
protests_econ_data[, standardize_bop := balance_of_payment / gdp_2010,
  by = year]

# Making the final dataset with lags.

# Lag `x` by `n` rows in increasing order of `yr`, returning the result in
# the ORIGINAL row positions. A bare shift() would lag by whatever order the
# rows happen to be in; this version gives the same answer when rows are
# already sorted by year and the correct answer when they are not.
# NOTE(review): like shift(), this lags by rows, not calendar years - it
# assumes each country has one row per consecutive year; confirm.
lag_in_year_order <- function(x, yr, n) {
  ord <- order(yr)
  lagged <- x
  lagged[ord] <- shift(x[ord], n = n)
  lagged
}

# One- and two-period lags, computed separately for each country. Column
# names and their order are kept exactly as before.
protests_econ_data[, `:=`(
  lag1_standardize_bop = lag_in_year_order(standardize_bop, year, 1L),
  lag1_balance_of_payment = lag_in_year_order(balance_of_payment, year, 1L),
  lag1_cab_usd = lag_in_year_order(cab_usd, year, 1L),
  lag1_cab_pct_gdp = lag_in_year_order(cab_pct_gdp, year, 1L),
  lag1_gdp_2010 = lag_in_year_order(gdp_2010, year, 1L),
  lag1_pop_count = lag_in_year_order(pop_count, year, 1L),
  lag1_debt_pct_gdp = lag_in_year_order(debt_pct_gdp, year, 1L),
  lag1_e3 = lag_in_year_order(e3, year, 1L),
  lag1_gdp_pc = lag_in_year_order(gdp_pc, year, 1L),
  lag1_democ = lag_in_year_order(democ, year, 1L),
  lag1_trade = lag_in_year_order(ext_trade_bal, year, 1L),
  lag2_trade = lag_in_year_order(ext_trade_bal, year, 2L),
  lag2_standardize_bop = lag_in_year_order(standardize_bop, year, 2L),
  lag2_balance_of_payment = lag_in_year_order(balance_of_payment, year, 2L),
  lag2_cab_usd = lag_in_year_order(cab_usd, year, 2L),
  lag2_cab_pct_gdp = lag_in_year_order(cab_pct_gdp, year, 2L),
  lag2_gdp_2010 = lag_in_year_order(gdp_2010, year, 2L),
  lag2_pop_count = lag_in_year_order(pop_count, year, 2L),
  lag2_debt_pct_gdp = lag_in_year_order(debt_pct_gdp, year, 2L),
  lag2_e3 = lag_in_year_order(e3, year, 2L),
  lag2_democ = lag_in_year_order(democ, year, 2L),
  lag2_gdp_pc = lag_in_year_order(gdp_pc, year, 2L)
), by = country]

# Write the finished panel to disk.
save(protests_econ_data, file =
  "protests_econ_data.rdata")


